Grants: Data quality checksΒΆ
from decimal import Decimal
from re import sub
import pandas as pd
from pandas_profiling import ProfileReport
# some data cleaning functions
def parseMoney(money):
money = Decimal(sub(r'[^\d.]', '', money))
return money
df = pd.read_csv('../data/grants_data_raw.csv')
# inspect it
df.head()
| round_number | round_start_date | round_end_date | grant_title | grant_id | region | category | url | match_amount | num_contributions | num_unique_contributors | crowdfund_amount_contributions_usd | total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12 | 2021-12-01 | 2021-12-16 | Coin Center is educating policy makers about p... | 1668 | north_america | Community | https://gitcoin.co/grants/1668/coin-center-is-... | $340,000.00 | 6914 | 5727 | $103,838.93 | $443,838.93 |
| 1 | 12 | 2021-12-01 | 2021-12-16 | Electronic Frontier Foundation | 3974 | north_america | Community | https://gitcoin.co/grants/3974/electronic-fron... | $228,273.84 | 3648 | 3331 | $58,715.04 | $286,988.88 |
| 2 | 12 | 2021-12-01 | 2021-12-16 | The Tor Project | 2805 | undefined | Infra Tech | https://gitcoin.co/grants/2805/the-tor-project | $188,294.80 | 3497 | 3069 | $95,279.64 | $283,574.44 |
| 3 | 12 | 2021-12-01 | 2021-12-16 | Longevity Prize (by VitaDAO) | 4083 | europe | Grants Round 12 | https://gitcoin.co/grants/4083/longevity-prize... | $176,195.45 | 920 | 857 | $7,815.29 | $184,010.75 |
| 4 | 12 | 2021-12-01 | 2021-12-16 | Rotki - The portfolio tracker and accounting t... | 149 | europe | dApp Tech | https://gitcoin.co/grants/149/rotki-the-portfo... | $129,131.34 | 4684 | 4311 | $25,373.62 | $154,504.96 |
df.dtypes
round_number int64
round_start_date object
round_end_date object
grant_title object
grant_id int64
region object
category object
url object
match_amount object
num_contributions int64
num_unique_contributors int64
crowdfund_amount_contributions_usd object
total object
dtype: object
# parse these into float columns
df['match_amount'] = pd.to_numeric(df['match_amount'].str.replace('[^.0-9]', ''))
df['crowdfund_amount_contributions_usd'] = pd.to_numeric(df['crowdfund_amount_contributions_usd'].str.replace('[^.0-9]', ''))
df['total'] = pd.to_numeric(df['total'].str.replace('[^.0-9]', ''))
# df['match_amount'] = df['match_amount'].apply(lambda x: parseMoney(x))
# df['crowdfund_amount_contributions_usd'] = df['crowdfund_amount_contributions_usd'].apply(lambda x: parseMoney(x))
# df['total'] = df['total'].apply(lambda x: parseMoney(x))
/var/folders/x4/w5cvp_v94hj1jb6h056qs6r80000gp/T/ipykernel_17241/1127562104.py:2: FutureWarning: The default value of regex will change from True to False in a future version.
df['match_amount'] = pd.to_numeric(df['match_amount'].str.replace('[^.0-9]', ''))
/var/folders/x4/w5cvp_v94hj1jb6h056qs6r80000gp/T/ipykernel_17241/1127562104.py:3: FutureWarning: The default value of regex will change from True to False in a future version.
df['crowdfund_amount_contributions_usd'] = pd.to_numeric(df['crowdfund_amount_contributions_usd'].str.replace('[^.0-9]', ''))
/var/folders/x4/w5cvp_v94hj1jb6h056qs6r80000gp/T/ipykernel_17241/1127562104.py:4: FutureWarning: The default value of regex will change from True to False in a future version.
df['total'] = pd.to_numeric(df['total'].str.replace('[^.0-9]', ''))
df.describe()
| round_number | grant_id | match_amount | num_contributions | num_unique_contributors | crowdfund_amount_contributions_usd | total | |
|---|---|---|---|---|---|---|---|
| count | 5906.000000 | 5906.000000 | 5906.000000 | 5906.000000 | 5906.000000 | 4715.000000 | 4976.000000 |
| mean | 9.399763 | 1449.218591 | 1146.815586 | 228.686590 | 186.696072 | 1984.569552 | 3241.627518 |
| std | 2.629641 | 1104.701308 | 7916.396923 | 934.435539 | 736.134848 | 11921.769330 | 16768.091827 |
| min | 1.000000 | 12.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 8.000000 | 470.250000 | 0.000000 | 2.000000 | 2.000000 | 15.775000 | 12.590000 |
| 50% | 10.000000 | 1240.500000 | 0.660000 | 9.000000 | 8.000000 | 119.550000 | 112.905000 |
| 75% | 12.000000 | 2202.000000 | 59.182500 | 57.000000 | 47.000000 | 847.515000 | 1058.295000 |
| max | 12.000000 | 4367.000000 | 340000.000000 | 22240.000000 | 12317.000000 | 652919.460000 | 674332.580000 |
# profile the data
profile = ProfileReport(df, title="Grants Data Profile")
profile
# write out the clean data
df.to_csv('../data/grants_data.csv', index=False)